In [129]:
#solrpy library: http://pythonhosted.org/solrpy/overview.html
import solr #to install: pip install solrpy
#pandas library for data processing - only needed to index the solr core, can be removed otherwise
import pandas as pd #to install: pip install pandas
#numpy - used by the IR metric functions below
import numpy as np #to install: pip install numpy
#scikit-optimize library: https://github.com/scikit-optimize
import skopt #to install: pip install scikit-optimize

Settings


In [2]:
#Settings

# The files below are in the root folder of this GitHub repo. Launch jupyter notebook from that folder
# in order to read these files: 'jupyter notebook'

# Note: this is an artificial set of jobs - they are not real postings, but they are representative of our data
# Job descriptions are omitted, but usually we search that field also
jobs_data_file = "jobs.csv"

# File of relevancy judgements - these are highly subjective judgements, please don't take them too seriously
relevancy_file = "relevancy_judegements.csv"

#solr url and core (Jobs)
solr_url = "http://localhost:8983/solr/Jobs"

Load Jobs Data, Index in Solr Jobs Core


In [3]:
# Note: You can skip this section if you were able to load the Solr Jobs Core along with the data directory from the 
# './Solr Core and Config' sub-folder. Older versions of Solr won't read this data, so here's some code to populate 
# the index from the jobs.csv file

jobs_df = pd.read_csv(jobs_data_file, sep=",")
jobs_df["jobSkills"] = jobs_df["jobSkills"].apply(lambda sk: sk.split("|"))
# assign a unique doc id to each row
jobs_df["id"] = range(len(jobs_df))
jobs_df.head(5)


Out[3]:
jobTitle jobSkills employer city state geoCode id
0 Lead Developer [Project management, Java, Programming, QA] IT Services and Networking Corp. New York NY 40.7127837,-74.0059413 0
1 Cloud Developer [QA, Software engineering, Compiler, Network, ... Large Search Giant Llc. Des Moines IA 41.6005448,-93.6091064 1
2 Application Developer [J2EE, Oracle, XML, QA, jQuery, JDBC, BIND, IBM] Acme Inc Des Moines IA 41.6005448,-93.6091064 2
3 Application Developer [Programming, Lifecycle management, Network] IT Services and Networking Corp. Chicago IL 41.8781136,-87.6297982 3
4 Pega Developer [QA, Agile, Architecture] Scientists and Quants Inc New York NY 40.7127837,-74.0059413 4

In [4]:
solr_connection = solr.Solr(solr_url, persistent=True, timeout=360, max_retries=5)

# convert the dataframe to a list of dictionaries (the document format required by the solr client library)
docs = jobs_df.T.to_dict().values()

#wipe out any existing documents if present
solr_connection.delete_query("*:*")

# send documents
solr_connection.add_many(docs)

# hard commit and optimize
solr_connection.commit()
solr_connection.optimize()


Out[4]:
'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">0</int></lst>\n</response>\n'
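
To confirm the documents were indexed, you can run a match-all query and check the hit count - a minimal sanity check, assuming the solrpy response object exposes a numFound attribute:


In [ ]:
# a match-all query should report as many hits as there are rows in jobs_df
check = solr_connection.select(q="*:*", fields="id", start=0, rows=0)
print("Indexed documents: " + str(check.numFound))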

Load Relevancy Judgements File


In [130]:
# The 'relevant' column is a list of document ids (the id field from the schema) that were both in the top 20
# returned documents and were subjectively judged as relevant to the original query.
# We can subsequently use these to derive a MAP score for a given query

rel_df = pd.read_csv(relevancy_file, sep="|", converters={"fq": str, "location": str})
searches = rel_df.T.to_dict()
rel_df.head(3)


Out[130]:
query fq location relevant
0 java developer {!geofilt}&sfield=geoCode&pt=41.884251,-87.632... Chicago, IL 8,20,27,52,127,159,194,354,364,414,485,499,677...
1 data warehouse {!geofilt}&sfield=geoCode&pt=41.884251,-87.632... Chicago, IL 1078,1996,254,254,870,1968
2 web services {!geofilt}&sfield=geoCode&pt=40.7127837, -74.0... New York, NY 1342,1449,395,1272,1512,54,608,1528,38,84,150,...

In [154]:
# Takes a search id, a qf setting and a row count, and returns the predicted doc ids along with the relevant doc ids
def get_results_for_search(sid, qf_value, rows):
    search = searches[sid]
    fq = ""
    pt = "0,0"
    
    if search["location"].strip() != "":
        splt = [s for s in search["fq"].split("&") if "pt=" in s]
        if splt:
            pt = splt[0].replace("pt=","")
            fq = "{!geofilt}"

    resp = solr_connection.select(
       q=search["query"], 
       fields="id",
       start=0, rows=rows, 
       qf=qf_value, # comes from get_solr_params
       fq=fq,
       sfield="geoCode",
       pt=pt,
       score=False,
       d="48.00", wt="json")
    predicted = list(map(lambda res: res["id"], resp.results))
    # return predicted doc ids, along with the relevant ones (for the IR metrics)
    return predicted, list(map(int, search["relevant"].split(",")))
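
For example, a single call looks like the sketch below; it needs the running Solr core, and the qf string is just an illustrative value:


In [ ]:
# predicted: the top 20 doc ids Solr returns for search 0 with these boosts
# relevant:  the doc ids judged relevant for that search in the relevancy file
predicted, relevant = get_results_for_search(0, "employer^1  jobTitle^2  jobskills^1", 20)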

IR Metrics


In [246]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : set
             A set of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mean_average_precision_at_k(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average precision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of sets of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

def average_ndcg_at_k(actual, predicted, k, method=0):
    vals = [ ndcg_at_k(act, pred, k, method) for act, pred in zip(actual, predicted)]
    return np.mean(vals)

def ndcg_at_k(actual, predicted, k, method=0):
    
    # convert to ratings - actual relevant results give rating of 10, vs 1 for the rest
    act_hash = set(actual)    
    best_ratings = [ 10 for docid in actual ] + [1 for i in range(0, len(predicted) - len(actual))]
    pred_ratings = [ 10 if docid in act_hash else 1 for docid in predicted ]
        
    dcg_max = dcg_at_k(best_ratings, k, method)
    if not dcg_max:
        return 0.0
    dcg = dcg_at_k(pred_ratings, k, method)
    return dcg / dcg_max

def dcg_at_k(r, k, method=0):
    """
    Code taken from: https://gist.github.com/bwhite/3726239
    
    Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.
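
A quick sanity check of these metrics on toy data (hypothetical doc ids, not taken from the index):


In [ ]:
# one query: docs 1, 2 and 3 are relevant; the engine returned 1, 4, 2, 5, 3 in that order
toy_actual    = [[1, 2, 3]]
toy_predicted = [[1, 4, 2, 5, 3]]

# AP@5 = (1/1 + 2/3 + 3/5) / 3, roughly 0.756; the mean over this single query is the same
mean_average_precision_at_k(toy_actual, toy_predicted, k=5)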

In [249]:
# Measure results for one set of qf settings (the objective function is defined in the Black Box Optimization section below)
score = objective([3,1.5,1.1])
score # Score is negative, as skopt tries to minimize the function output


Out[249]:
-0.74894708994708992

Black Box Optimization


In [250]:
# Function takes a list of real numbers (currently 3 - one boost per qf field; 9 if the pf2/pf lines are uncommented) and returns a set of solr configuration options
def get_solr_params(params):
    return {"qf" : "employer^{0}  jobTitle^{1}  jobskills^{2}".format(*params[0:3])
            #"pf2" :  "employer^{0}  jobTitle^{1}  jobSkills^{2}".format(*params[3:6]), 
            #"pf"  :  "employer^{0}  jobTitle^{1}  jobSkills^{2}".format(*params[6:9]) 
           }
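
A quick usage example, plus a note on extending it: with the pf2/pf lines uncommented the function would read 9 numbers, so the 3-dimensional search space defined in the optimizer section below would need to grow to 9 dimensions to match (this sketch is illustrative only):


In [ ]:
# three numbers map directly to the three qf field boosts
get_solr_params([3, 1.5, 1.1])
# -> {'qf': 'employer^3  jobTitle^1.5  jobskills^1.1'}

# with the pf2/pf lines uncommented, pass 9 numbers and widen the search space to match, e.g.
# space = [(0.0, 50.0) for i in range(9)]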

In [270]:
# split into training and test sets of queries
sids = list(searches.keys())
cutoff = int(0.75* len(sids))
train_sids, test_sids = sids[:cutoff], sids[cutoff:]
train_sids, test_sids


Out[270]:
([0, 1, 2, 3, 4, 5, 6], [7, 8, 9])

In [271]:
# Precision cut off
PREC_AT = 20
# Black box objective function to minimize
# This is for the training data
def objective(params):
    # map list of numbers into solr parameters (just qf in this case)
    additional_params = get_solr_params(params)
    
    predicted, actual =[],[]
    for sid in train_sids:
        pred, act = get_results_for_search(sid, additional_params["qf"], PREC_AT)
        predicted.append(pred)
        actual.append(act)
    # Compute Mean average precision at 20
    return -1.0 * mean_average_precision_at_k(actual, predicted, PREC_AT)
    # Can also use NDCG - the version above is tailored for binary judgements
    #return -1.0 * average_ndcg_at_k(actual, predicted, PREC_AT)

# This is for the test data (held out dataset)
def evaluate(params):
    # map list of numbers into solr parameters (just qf in this case)
    additional_params = get_solr_params(params)
    
    predicted, actual =[],[]
    for sid in test_sids:
        pred, act = get_results_for_search(sid, additional_params["qf"], PREC_AT)
        predicted.append(pred)
        actual.append(act)
    # Compute Mean average precision at 20
    return -1.0 * mean_average_precision_at_k(actual, predicted, PREC_AT)

Run Optimizer Algorithm


In [257]:
# Example of how the black box function is called to measure the value of one set of parameters (qf settings in this case)
score = objective([3, 2.5, 1.5])
# Score is negative as it is -1 * (IR metric); the skopt library tries to find the parameters that minimize the score
score


Out[257]:
-0.83492361342361343

In [267]:
# simple call back function to print progress while optimizing
def callback(res):
    call_no = len(res.func_vals)
    current_fun = res.func_vals[-1]
    print str(call_no).ljust(5) + "\t" + \
        str(-1.0* current_fun).ljust(20) + "\t" + str(map(lambda d: round(d,3), res.x_iters[-1]))

The code below runs the scikit-optimize library and tries to find the set of parameters that minimizes the objective function above. We are choosing to map the parameter values to qf values (field boosts), but in theory you can try any configuration setting here that you can test in this way. Some settings, such as changes to the config files themselves, can be applied with a core reload (see the sketch below), or in some cases a server restart. Note, however, that the algorithm needs to run for quite a few iterations to learn effectively from your data, and for some problems it may not be able to find a near-optimal solution.
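
As an aside, if a candidate setting lives in solrconfig.xml rather than in the query parameters, the core reload mentioned above can be triggered from the notebook between evaluations. A minimal sketch using Solr's Core Admin API and the requests library (not imported above; the core name is assumed to match the URL in the Settings section):


In [ ]:
import requests #to install: pip install requests

def reload_core(core_name="Jobs", solr_base="http://localhost:8983/solr"):
    # Core Admin RELOAD picks up solrconfig.xml / schema changes without restarting the server
    resp = requests.get(solr_base + "/admin/cores",
                        params={"action": "RELOAD", "core": core_name, "wt": "json"})
    resp.raise_for_status()
    return resp.json()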


In [272]:
from skopt import gbrt_minimize
import datetime

ITERATIONS = 100 # probably want this to be high (500 calls or more); set to a small value greater than 10 to test that it is working
min_val, max_val = 0.0, 50.0
# min and max for each possible qf value (we read 3 in get_solr_params currently)
space  = [(min_val, max_val) for i in range(3)] 

start = datetime.datetime.now()
print "Starting at ", start
print "Run","\t", "Current MAP", "\t\t", "Parameters"
# run optimizer, which will try to minimize the objective function
res = gbrt_minimize(objective,       # the function to minimize
                  space,             # the bounds on each dimension of x
                  acq="LCB",         # controls how it searches for parameters
                  n_calls=ITERATIONS,# the number of evaluations of f including at x0
                  random_state=777,  # set to a fixed number if you want this to be deterministic
                  n_jobs=-1,         # how many threads (or really python processes due to GIL)
                  callback=callback) 

end = datetime.datetime.now()


Starting at  2016-10-13 13:37:47.798818
Run 	Current MAP 		Parameters
1    	0.789787257338      	[7.633, 4.669, 11.146]
2    	0.57634460211       	[15.118, 3.984, 9.322]
3    	0.789787257338      	[3.102, 29.481, 19.532]
4    	0.789787257338      	[22.993, 17.167, 9.658]
5    	0.789787257338      	[41.763, 49.444, 30.546]
6    	0.789787257338      	[46.35, 31.324, 44.14]
7    	0.789787257338      	[36.349, 34.089, 31.117]
8    	0.789787257338      	[38.425, 27.613, 12.656]
9    	0.789787257338      	[13.46, 13.443, 8.997]
10   	0.789787257338      	[32.201, 18.663, 40.82]
11   	0.789787257338      	[11.269, 15.497, 29.595]
12   	0.789787257338      	[20.547, 40.29, 35.926]
13   	0.789787257338      	[3.638, 11.26, 49.192]
14   	0.789787257338      	[31.253, 46.627, 37.928]
15   	0.789787257338      	[42.202, 48.694, 26.05]
16   	0.789787257338      	[26.195, 25.704, 30.834]
17   	0.789787257338      	[1.909, 49.891, 49.151]
18   	0.789787257338      	[15.706, 30.462, 27.224]
19   	0.789787257338      	[2.68, 8.532, 3.091]
20   	0.789787257338      	[3.708, 24.603, 38.959]
21   	0.728375454855      	[19.226, 9.157, 16.54]
22   	0.728375454855      	[26.191, 14.921, 39.56]
23   	0.789787257338      	[1.391, 2.871, 46.726]
24   	0.709719507628      	[12.891, 3.945, 27.206]
25   	0.57634460211       	[39.588, 4.306, 1.604]
26   	0.789787257338      	[3.249, 3.978, 16.225]
27   	0.789787257338      	[16.69, 48.174, 9.223]
28   	0.57634460211       	[9.509, 2.055, 6.204]
29   	0.57634460211       	[7.135, 0.578, 35.504]
30   	0.789787257338      	[0.001, 1.27, 15.887]
31   	0.57634460211       	[3.419, 0.325, 19.748]
32   	0.789787257338      	[48.939, 29.199, 48.733]
33   	0.57634460211       	[28.73, 5.601, 20.555]
34   	0.789787257338      	[45.542, 30.89, 37.488]
35   	0.789787257338      	[8.569, 22.665, 31.376]
36   	0.789787257338      	[6.646, 46.63, 35.397]
37   	0.789787257338      	[2.325, 17.756, 16.661]
38   	0.789787257338      	[25.798, 16.771, 13.985]
39   	0.789787257338      	[8.911, 6.325, 10.43]
40   	0.709719507628      	[6.497, 1.988, 12.637]
41   	0.789787257338      	[4.907, 5.265, 2.189]
42   	0.789787257338      	[1.865, 2.763, 15.76]
43   	0.728375454855      	[8.432, 4.514, 11.192]
44   	0.57634460211       	[44.589, 4.986, 10.523]
45   	0.789787257338      	[1.666, 4.527, 35.625]
46   	0.57634460211       	[5.675, 0.806, 18.293]
47   	0.789787257338      	[0.607, 10.608, 46.907]
48   	0.789787257338      	[32.919, 20.687, 42.497]
49   	0.789787257338      	[29.812, 21.06, 18.681]
50   	0.789787257338      	[26.974, 26.03, 13.369]
51   	0.789787257338      	[1.213, 2.867, 25.221]
52   	0.789787257338      	[3.311, 3.447, 5.755]
53   	0.789787257338      	[1.434, 2.976, 9.72]
54   	0.57634460211       	[3.805, 1.035, 39.879]
55   	0.789787257338      	[7.097, 30.008, 39.301]
56   	0.789787257338      	[25.475, 16.37, 47.728]
57   	0.789787257338      	[39.985, 28.122, 31.249]
58   	0.57634460211       	[37.455, 5.098, 13.536]
59   	0.789787257338      	[43.914, 36.802, 21.952]
60   	0.789787257338      	[32.556, 49.627, 31.495]
61   	0.789787257338      	[26.191, 27.714, 48.825]
62   	0.789787257338      	[27.228, 42.453, 0.878]
63   	0.789787257338      	[14.24, 39.124, 26.839]
64   	0.789787257338      	[16.463, 13.835, 40.497]
65   	0.728375454855      	[21.164, 10.936, 48.27]
66   	0.728375454855      	[20.867, 11.45, 23.094]
67   	0.57634460211       	[35.272, 5.012, 26.865]
68   	0.728375454855      	[45.792, 26.401, 11.6]
69   	0.789787257338      	[17.124, 34.076, 0.088]
70   	0.789787257338      	[4.036, 5.833, 0.333]
71   	0.789787257338      	[4.146, 4.596, 1.567]
72   	0.566862364771      	[49.238, 14.449, 49.105]
73   	0.728375454855      	[10.682, 5.287, 1.837]
74   	0.701782999691      	[15.613, 5.668, 1.407]
75   	0.57634460211       	[17.545, 2.875, 16.856]
76   	0.789787257338      	[47.88, 43.344, 46.422]
77   	0.701782999691      	[3.284, 1.233, 15.523]
78   	0.57634460211       	[6.458, 0.825, 11.828]
79   	0.789787257338      	[35.758, 47.713, 38.415]
80   	0.789787257338      	[16.539, 41.749, 25.976]
81   	0.789787257338      	[4.905, 25.428, 10.925]
82   	0.789787257338      	[0.335, 4.916, 37.995]
83   	0.789787257338      	[28.091, 20.628, 19.817]
84   	0.789787257338      	[2.956, 4.116, 32.588]
85   	0.789787257338      	[2.429, 1.435, 38.5]
86   	0.789787257338      	[3.053, 1.86, 38.95]
87   	0.789787257338      	[49.483, 33.301, 44.404]
88   	0.789787257338      	[49.118, 49.176, 23.353]
89   	0.789787257338      	[49.878, 33.831, 45.104]
90   	0.789787257338      	[18.099, 14.532, 49.317]
91   	0.572735380644      	[49.669, 14.049, 48.293]
92   	0.789787257338      	[0.822, 2.53, 7.718]
93   	0.789787257338      	[0.686, 2.419, 1.253]
94   	0.728375454855      	[3.094, 1.684, 11.845]
95   	0.789787257338      	[12.516, 13.298, 4.776]
96   	0.789787257338      	[1.97, 3.009, 9.946]
97   	0.572735380644      	[47.755, 13.42, 3.868]
98   	0.789787257338      	[2.278, 4.854, 12.789]
99   	0.789787257338      	[0.414, 3.942, 9.38]
100  	0.789787257338      	[0.06, 1.691, 8.329]

The evaluate call below is the same as the objective function, except it measures our newly optimized set of parameters on a different, held-out set of queries. This gives a more accurate estimate of how the new settings perform on queries that were not in the training dataset.


In [273]:
# res.fun - the best objective value found (the IR metric * -1), res.x - the best performing parameters
test_score = evaluate(res.x)
test_score


Out[273]:
-0.70588235294117652

The results on the training queries here are much higher than on the test set. This is typical for a lot of machine learning / optimization problems. If you are tuning an existing Solr installation, you will want to ensure that the IR metric score on the test set beats the current production settings before releasing to production; a sketch of that comparison follows.
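
A minimal sketch of that comparison, assuming the current production boosts are all 1.0 (a hypothetical baseline - substitute your real qf values):


In [ ]:
# hypothetical production boosts for the three qf fields - replace with your live settings
baseline_params = [1.0, 1.0, 1.0]

baseline_score = -1 * evaluate(baseline_params)
tuned_score = -1 * evaluate(res.x)

print("Baseline IR Metric @" + str(PREC_AT) + " Test Data = " + str(baseline_score))
print("Tuned    IR Metric @" + str(PREC_AT) + " Test Data = " + str(tuned_score))
if tuned_score <= baseline_score:
    print("The tuned settings do not beat the baseline on the test set - keep the current production settings")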


In [276]:
print("IR Metric @" + str(PREC_AT) + " Training Data = " +  str(-1 * res.fun))
print("IR Metric @" + str(PREC_AT) + " Test Data     = " +  str(-1 * test_score))
print("\nParameters:\n\t"),
print get_solr_params(res.x)["qf"]
print "\ngbrt_minimize took", (end - start).total_seconds(), "secs"


IR Metric @20 Training Data = 0.789787257338
IR Metric @20 Test Data     = 0.705882352941

Parameters:
	employer^7.63318674507  jobTitle^4.66866284146  jobskills^11.1464049498

gbrt_minimize took 14.664135 secs